# Load the A/B test observations. The path is kept in a separate variable so
# it can be changed in one place; read_csv() (readr/tidyverse) returns a tibble.
data_file_name <- "streaming_data.csv"
streaming_df <- read_csv(data_file_name)

Introduction

A good recommendation engine can be a key factor in improving user engagement. Streaming company Why Not Watch? (WNW) has developed a new recommendation engine algorithm and wanted to see if this new recommendation system is worth rolling out. Sample data showing the results of an A/B test was provided. It was observed that there was bias in the data provided. Statistical analysis was conducted to find out the effect of the new recommendation engine. From the analysis it was noted that the overall hours watched increased, in particular for users whose social metric is between 5 and 10 and who belong to an older demographic.

Problem Statement

Is the new recommendation engine algorithm worth rolling out to all subscribers? That is, did the new recommendation engine lead to an increase in the average hours watched per user per day?

I will be using linear regression and multiple linear regression to find the effect of the independent variables on the dependent variable, and the correlation coefficient to measure the relationship between variables ("strength of linear association"). I will conduct A/B testing to compare the difference between the two groups (A & B) to see if the new recommendation engine led to an increase in the hours watched.

Data

## List and explain the important variables. 
glimpse(streaming_df)
## Rows: 1,000
## Columns: 8
## $ date              <chr> "01/07", "01/07", "01/07", "01/07", "01/07", "01/07"…
## $ gender            <chr> "F", "F", "F", "M", "M", "M", "F", "M", "M", "M", "M…
## $ age               <dbl> 28, 32, 39, 52, 25, 51, 53, 42, 41, 20, 26, 25, 43, …
## $ social_metric     <dbl> 5, 7, 4, 10, 1, 0, 5, 6, 8, 7, 9, 5, 1, 10, 4, 9, 9,…
## $ time_since_signup <dbl> 19.3, 11.5, 4.3, 9.5, 19.5, 22.6, 4.2, 8.5, 16.9, 23…
## $ demographic       <dbl> 1, 1, 3, 4, 2, 4, 3, 4, 4, 2, 2, 1, 3, 1, 4, 4, 4, 4…
## $ group             <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A…
## $ hours_watched     <dbl> 4.08, 2.99, 5.74, 4.13, 4.68, 3.40, 3.07, 2.77, 2.24…
# Date : Period of observation (day/month within July; stored as character)
# Gender : Gender of the customer, F for Female, M for Male
# Age : Age of the customer in years
# Social_metric : combined metric based on previous viewing habits
# Time_since_signup : No. of months since the customer signed up
# Demographic : Demographic number (categories 1-4; exact segment meanings
#   are not given in the data dictionary — see the bias analysis below)
# Group : A is the control group, B is the treated group
# Hours_watched : number of hours watched in that day


## Explain the scale of numeric variables.
# Date : Period of observation 1/7 - 31/7 (Interval)
streaming_df %>% distinct(date) %>% summarise(min = min(date), max = max(date))
## # A tibble: 1 x 2
##   min   max  
##   <chr> <chr>
## 1 01/07 31/07
# Age : Min age 18 , Max age 55 (Ordinal)
# NOTE(review): age in years is normally treated as ratio scale — confirm the
# intended classification.
streaming_df %>% distinct(age) %>% summarise(min = min(age), max = max(age))
## # A tibble: 1 x 2
##     min   max
##   <dbl> <dbl>
## 1    18    55
# Social_metric : 0 ~ 10 (Ordinal)
streaming_df %>% distinct(social_metric) %>% arrange(social_metric)
## # A tibble: 11 x 1
##    social_metric
##            <dbl>
##  1             0
##  2             1
##  3             2
##  4             3
##  5             4
##  6             5
##  7             6
##  8             7
##  9             8
## 10             9
## 11            10
# Time_since_signup : 0 months - 24 months (Interval)
streaming_df %>% summarise(min = min(time_since_signup), max = max(time_since_signup))
## # A tibble: 1 x 2
##     min   max
##   <dbl> <dbl>
## 1     0    24
# Demographic : 1-4 (Nominal)
streaming_df %>% distinct(demographic)
## # A tibble: 4 x 1
##   demographic
##         <dbl>
## 1           1
## 2           3
## 3           4
## 4           2
# hours_watched : 0.5 hours - 8.3 hours / per day (Ratio)
streaming_df %>% summarise(min = min(hours_watched), max = max(hours_watched))
## # A tibble: 1 x 2
##     min   max
##   <dbl> <dbl>
## 1   0.5   8.3

Examine the data to find any bias in sampling

# Split the sample into the two experiment arms:
# A = control (880 rows), B = treatment (120 rows).
streaming_a <- streaming_df %>% filter(group == 'A')
streaming_b <- streaming_df %>% filter(group == 'B')
# Gender balance in the control arm.
streaming_a %>% group_by(gender) %>% summarise(n=n())
## # A tibble: 2 x 2
##   gender     n
##   <chr>  <int>
## 1 F        400
## 2 M        480
qplot(streaming_a$gender) +
  xlab("Gender") + ylab("Count")

# Gender balance in the treatment arm — heavily male-skewed (91 M vs 29 F).
streaming_b %>% group_by(gender) %>% summarise(n=n())
## # A tibble: 2 x 2
##   gender     n
##   <chr>  <int>
## 1 F         29
## 2 M         91
qplot(streaming_b$gender) +
  xlab("Gender") + ylab("Count")

# Demographic by gender, control arm: females appear only in demographics
# 1 and 3, males only in 2 and 4.
streaming_a %>% filter(gender == 'F' ) %>% group_by(demographic) %>% summarise(n=n())
## # A tibble: 2 x 2
##   demographic     n
##         <dbl> <int>
## 1           1   203
## 2           3   197
streaming_a %>% filter(gender == 'M' ) %>% group_by(demographic) %>% summarise(n=n())
## # A tibble: 2 x 2
##   demographic     n
##         <dbl> <int>
## 1           2   236
## 2           4   244
gg <- ggplot(streaming_a, aes(x = demographic, fill = gender))
gg <- gg + geom_histogram(alpha = 0.5, position = "identity", binwidth = 1)
gg

# Same demographic/gender split holds in the treatment arm.
streaming_b %>% filter(gender == 'F' ) %>% group_by(demographic) %>% summarise(n=n())
## # A tibble: 2 x 2
##   demographic     n
##         <dbl> <int>
## 1           1    13
## 2           3    16
streaming_b %>% filter(gender == 'M' ) %>% group_by(demographic) %>% summarise(n=n())
## # A tibble: 2 x 2
##   demographic     n
##         <dbl> <int>
## 1           2    32
## 2           4    59
streaming_b %>% group_by(gender, demographic) %>% summarise(n=n())
## # A tibble: 4 x 3
## # Groups:   gender [2]
##   gender demographic     n
##   <chr>        <dbl> <int>
## 1 F                1    13
## 2 F                3    16
## 3 M                2    32
## 4 M                4    59
gg <- ggplot(streaming_b, aes(x = demographic, fill = gender))
gg <- gg + geom_histogram(alpha = 0.5, position = "identity", binwidth = 1)
gg

# Age distribution per arm: group A has reasonable counts per age, group B
# is very sparse (mostly 1-5 users per age).
streaming_a %>% group_by(age) %>% summarise(n=n())
## # A tibble: 38 x 2
##      age     n
##    <dbl> <int>
##  1    18    12
##  2    19    19
##  3    20    32
##  4    21    21
##  5    22    24
##  6    23    32
##  7    24    23
##  8    25    27
##  9    26    18
## 10    27    18
## # … with 28 more rows
streaming_b %>% group_by(age) %>% summarise(n=n())
## # A tibble: 37 x 2
##      age     n
##    <dbl> <int>
##  1    18     2
##  2    19     2
##  3    20     1
##  4    21     2
##  5    23     2
##  6    24     2
##  7    25     2
##  8    26     5
##  9    27     3
## 10    28     3
## # … with 27 more rows
gg <- ggplot(streaming_a, aes(x = age))
gg <- gg + geom_histogram(alpha = 0.5, position = "identity", binwidth = 1)
gg

gg <- ggplot(streaming_b, aes(x = age))
gg <- gg + geom_histogram(alpha = 0.5, position = "identity", binwidth = 1)
gg

# Age x gender cross-tabulation per arm.
streaming_a %>% group_by(age,gender) %>% summarise(n=n())
## # A tibble: 76 x 3
## # Groups:   age [38]
##      age gender     n
##    <dbl> <chr>  <int>
##  1    18 F          5
##  2    18 M          7
##  3    19 F         11
##  4    19 M          8
##  5    20 F         13
##  6    20 M         19
##  7    21 F          8
##  8    21 M         13
##  9    22 F         15
## 10    22 M          9
## # … with 66 more rows
gg <- ggplot(streaming_a, aes(x = age, fill = gender))
gg <- gg + geom_histogram(alpha = 0.5, position = "identity", binwidth = 1)
gg

streaming_b %>% group_by(age,gender) %>% summarise(n=n())
## # A tibble: 55 x 3
## # Groups:   age [37]
##      age gender     n
##    <dbl> <chr>  <int>
##  1    18 F          1
##  2    18 M          1
##  3    19 F          1
##  4    19 M          1
##  5    20 M          1
##  6    21 F          1
##  7    21 M          1
##  8    23 M          2
##  9    24 M          2
## 10    25 M          2
## # … with 45 more rows
gg <- ggplot(streaming_b, aes(x = age, fill = gender))
gg <- gg + geom_histogram(alpha = 0.5, position = "identity", binwidth = 1)
gg

# Age x demographic: demographics 1 & 2 only contain the younger ages.
streaming_a %>% group_by(age, demographic) %>% summarise(n=n())
## # A tibble: 76 x 3
## # Groups:   age [38]
##      age demographic     n
##    <dbl>       <dbl> <int>
##  1    18           1     5
##  2    18           2     7
##  3    19           1    11
##  4    19           2     8
##  5    20           1    13
##  6    20           2    19
##  7    21           1     8
##  8    21           2    13
##  9    22           1    15
## 10    22           2     9
## # … with 66 more rows
# BUG FIX: `labels` is not a qplot() argument, and labs(fill = "Gender")
# targets the fill aesthetic while this plot maps gender to colour — so the
# legend title was never renamed. Use labs(colour = ...) on the colour scale.
q1 <- qplot(streaming_a$age, streaming_a$demographic, 
      colour = streaming_a$gender) +
      xlab("Age") + ylab("Demographic")
q1 + labs(colour = "Gender")

# Age x demographic for the treatment arm (same sparse pattern).
streaming_b %>% group_by(age, demographic) %>% summarise(n=n())
## # A tibble: 55 x 3
## # Groups:   age [37]
##      age demographic     n
##    <dbl>       <dbl> <int>
##  1    18           1     1
##  2    18           2     1
##  3    19           1     1
##  4    19           2     1
##  5    20           2     1
##  6    21           1     1
##  7    21           2     1
##  8    23           2     2
##  9    24           2     2
## 10    25           2     2
## # … with 45 more rows
qplot(streaming_b$age, streaming_b$demographic, 
      colour = streaming_b$gender) +
  xlab("Age") + ylab("Demographic")

# How many treatment observations were collected per date — checks whether
# group B was sampled evenly across the observation period.
b_date <- streaming_b %>% group_by(date) %>% summarise(n=n())

gg <- ggplot(data = b_date, aes(x=date, y=n))
gg <- gg + geom_bar(stat="identity", width = 0.5)
gg 

Is there any bias in the data collection? Yes — selection bias and omitted-variable bias. The data covers a specific subset of the audience instead of the whole, rendering the sample unrepresentative of the population. 1. For groups A & B, certain demographics contain only certain age groups. For example, demographics 1 & 2 only contain ages between 18-35 and demographics 3 & 4 only contain ages 35 and over.
2. For groups A & B there is inequality in the gender ratio. For group A, there are 480 males and 400 females. For group B, there are 91 males and 29 females (more than a two-fold difference!). 3. For groups A & B there is inequality in the age distribution. 4. For groups A & B there is inequality in the demographic distribution. 5. For the treatment group there is inequality in the number of customers collected on each date (from the 18th). There is a potential that hours watched might also be affected by whether it is a weekend or a weekday.

How could any bias be corrected? To minimize the bias, the easiest way is to remove the extreme values from the data — for example, values very close to 0 or at the very maximum. Also make sure that the sampling is completely randomised so that data is collected from every age, gender, and demographic group, with an equal number of observations from every date over a sufficient period of time (minimum two weeks).

What improvements could be made to future A/B tests? A sufficient sample size — there should be enough data to make a confident call. Minimise bias and use randomised sampling. Run the test for a sufficient length of time (e.g., multiple sales cycles, 2 to 4 weeks). If a test is stopped within a few days (even after reaching the required sample size), you're taking a convenience sample, not a representative sample.

Analysis

Linear regression

GROUP A - relationship between age & hours_watched

# x-axis / y-axis data: predictor = age, response = hours watched (group A)
x <- streaming_a$age
y <- streaming_a$hours_watched

# number of points (880 for group A; used later for the manual F-test)
n <- length(x)

# create a dataframe for x and y
data_df <- data.frame(x = x, y = y)

# quick plot to eyeball the relationship before fitting
qplot(data_df$x, data_df$y)

# create a model called "fit1": simple linear regression y ~ x
fit1 = lm(y ~ x, data = data_df) 

# contains coeff [1]=intercept, [2]=slope; find with coef(fit)[1]
summary(fit1)
## 
## Call:
## lm(formula = y ~ x, data = data_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5142 -0.7242 -0.0030  0.7474  3.0046 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.980111   0.126542   55.16   <2e-16 ***
## x           -0.073137   0.003356  -21.79   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.067 on 878 degrees of freedom
## Multiple R-squared:  0.3511, Adjusted R-squared:  0.3503 
## F-statistic:   475 on 1 and 878 DF,  p-value: < 2.2e-16
# extract intercept (a0) and slope (a1) for manual plotting / F-test below
a0 <- coef(fit1)[1]
a1 <- coef(fit1)[2]

Plot the fit over the data

# setup x variable with range of interest (1-year steps across observed ages)
xfit1 <- seq(min(data_df$x), max(data_df$x), 1)

# calculate the fitting function yfit based on the coefficients from the model
yfit1 <- a0 + a1 * xfit1

# scatter of the data with the fitted regression line overlaid in red
gg <- ggplot()
gg <- gg + geom_point(aes(x = data_df$x, y = data_df$y))
gg <- gg + geom_line(aes(x = xfit1, y = yfit1), colour = 'red')
gg <- gg + labs(x = 'x', y = 'y')
gg

# coefficient table and 95% confidence intervals for the estimates
fit1 %>% summary() %>% coef()
##              Estimate  Std. Error   t value      Pr(>|t|)
## (Intercept)  6.980111 0.126541629  55.16059 1.559230e-287
## x           -0.073137 0.003355869 -21.79376  1.641887e-84
fit1 %>% confint()
##                   2.5 %      97.5 %
## (Intercept)  6.73175125  7.22847005
## x           -0.07972346 -0.06655054

F-test on the best fit

# Manual F-test for fit1: y = a0 + a1 * x, i.e. 2 free parameters.
n_paras <- 2

# degrees of freedom for error term
dfe <- n - n_paras

# total number of degrees of freedom
df_total <- n - 1

# number of degrees of freedom for regression
df2 <- df_total - dfe

# calculate the mean of the observed data
y_mean <- mean(data_df$y)

# BUG FIX: the original code referenced data_df$f1 before that column was
# created (it is only added in the Residuals section further below), so SSR
# and SSE were sums over an empty vector and both F and the p-value printed
# as NaN. Compute the fitted values here, before the sums of squares.
f1 <- a0 + a1 * data_df$x

# calculate the sum of squares terms
SSR <- sum( (f1 - y_mean)^2 )
SSE <- sum( (data_df$y - f1)^2 )

# calculate the F statistic
f_score_2 <- (SSR/df2) / (SSE/dfe)

# convert to p value
p_value_2 <- pf(f_score_2, df2, dfe, lower.tail = FALSE)
print(paste('F =', f_score_2))
## now matches summary(fit1): F ≈ 475 on 1 and 878 DF
print(paste('p-value =', p_value_2))
## p-value < 2.2e-16
# compare to the in-built function in R
summary(fit1)
## 
## Call:
## lm(formula = y ~ x, data = data_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5142 -0.7242 -0.0030  0.7474  3.0046 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.980111   0.126542   55.16   <2e-16 ***
## x           -0.073137   0.003356  -21.79   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.067 on 878 degrees of freedom
## Multiple R-squared:  0.3511, Adjusted R-squared:  0.3503 
## F-statistic:   475 on 1 and 878 DF,  p-value: < 2.2e-16

This returns a p-value < 2e-16 so the null hypothesis that there is no relationship between the fitting function and the data is rejected.

Thus we can be confident that there is a statistically significant relationship.

Goodness of fit

print(paste('Best fit from lm: a1=', a1))
## [1] "Best fit from lm: a1= -0.0731369996521404"
# Scan a grid of candidate slopes and record the error of each, to show the
# lm() slope sits at the minimum of the error curve.
a1_values <- seq(-0.5, 0.5, 0.01)
mse <- a1_values

# NOTE(review): `mse` actually stores the (unnormalised) sum of squared
# errors, not the mean squared error — the minimiser is the same either way.
for (i in seq(1, length(a1_values))){
  yfit <- a0 + a1_values[i] * data_df$x
  mse[i] <- sum((data_df$y - yfit)^2)
}
gg <- ggplot()
gg <- gg + geom_point(aes(x = a1_values, y = mse))
gg <- gg + labs(x = 'Value for a1 coefficient', y = 'MSE')
gg

Residuals

# fit to each value of x (adds the fitted values as a column of data_df)
data_df$f1 <- a0 + a1 * data_df$x

# calculate the residual (observed minus fitted)
data_df$e1 <- data_df$y - data_df$f1

# residuals vs x: look for structure that would violate linearity or
# homoscedasticity assumptions
gg <- ggplot()
gg <- gg + geom_point(aes(x = data_df$x, y = data_df$e1))
gg <- gg + labs(x = 'x', y = 'residual')
gg

# built-in lm diagnostic plots (residuals vs fitted, Q-Q, scale-location,
# leverage)
plot(fit1)

# histogram of residuals to assess approximate normality
gg <- ggplot()
gg <- gg + geom_histogram(aes(x = data_df$e1), bins = 10, colour = "black") 
gg <- gg + labs(title = "Residuals Plot", x = "Residuals") 
gg

A linear regression model was fitted to predict the dependent variable, number of hours watched in that day, using the customer's age as a single predictor. Prior to fitting the regression, a scatter plot assessing the bivariate relationship between hours watched and age was inspected. The scatter plot demonstrated evidence of a negative linear relationship. The overall regression model was statistically significant, F(1,878)=475, p<.001, and explained 35.11% of the variability in hours watched, R2=0.3511. The estimated regression equation was y = 6.980 - 0.073x. The negative slope for age was statistically significant, b=-0.073, 95% CI [-0.080, -0.067]. Final inspection of the residuals supported normality and homoscedasticity.

Correlation

# Pearson correlation between age and hours watched in group A, with a 95%
# confidence interval. CIr() is from the psychometric package; n = 880 is
# the group A sample size.
r=cor(streaming_a$age, streaming_a$hours_watched)
CIr(r = r, n = 880, level = .95)
## [1] -0.6337707 -0.5478657

This confidence interval does not capture 0; therefore, H0 was rejected. There was a statistically significant negative correlation between age and hours_watched.

# Correlation matrix with p-values via Hmisc::rcorr() (requires a numeric
# matrix, hence the as.matrix conversion).
bivariate<-as.matrix(dplyr::select(streaming_a, age, hours_watched)) 
rcorr(bivariate, type = "pearson")
##                 age hours_watched
## age            1.00         -0.59
## hours_watched -0.59          1.00
## 
## n= 880 
## 
## 
## P
##               age hours_watched
## age                0           
## hours_watched  0

A Pearson correlation was calculated to measure the strength of the linear relationship between customers age and hours watched. The negative correlation was statistically significant, r=-0.59 , p<.001, 95% CI [-0.634, -0.548].

GROUP B - relationship between age & hours_watched

# x-axis / y-axis data
x <- streaming_b$age
y <- streaming_b$hours_watched

# number of points
n <- length(x)

# create a dataframe for x and y
data_df <- data.frame(x = x, y = y)

# quick plot
qplot(data_df$x, data_df$y)

# create a model called "fit1"
fit2 = lm(y ~ x, data = data_df) 

# contains coeff [1]=intercept, [2]=slope; find with coef(fit)[1]
summary(fit2)
## 
## Call:
## lm(formula = y ~ x, data = data_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3.13400 -0.76369  0.02025  0.78792  2.23803 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.68412    0.40368  19.035  < 2e-16 ***
## x           -0.07378    0.01004  -7.351 2.81e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.105 on 118 degrees of freedom
## Multiple R-squared:  0.3141, Adjusted R-squared:  0.3083 
## F-statistic: 54.04 on 1 and 118 DF,  p-value: 2.806e-11
b0 <- coef(fit2)[1]
b1 <- coef(fit2)[2]

Plot the fit over the data

# setup x variable with range of interest
xfit1 <- seq(min(data_df$x), max(data_df$x), 1)

# calculate the fitting function yfit based on the coefficients from the model
yfit1 <- b0 + b1 * xfit1

gg <- ggplot()
gg <- gg + geom_point(aes(x = data_df$x, y = data_df$y))
gg <- gg + geom_line(aes(x = xfit1, y = yfit1), colour = 'red')
gg <- gg + labs(x = 'x', y = 'y')
gg

fit2 %>% summary() %>% coef()
##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept)  7.6841158 0.40367767 19.035276 9.009361e-38
## x           -0.0737832 0.01003707 -7.351069 2.806279e-11
fit2 %>% confint()
##                   2.5 %      97.5 %
## (Intercept)  6.88472410  8.48350748
## x           -0.09365933 -0.05390707

F-test on the best fit

# Manual F-test for fit2 (group B, age): y = b0 + b1 * x, 2 free parameters.
n_paras <- 2

# degrees of freedom for error term
dfe <- n - n_paras

# total number of degrees of freedom
df_total <- n - 1

# number of degrees of freedom for regression
df2 <- df_total - dfe

# calculate the mean of the observed data
y_mean <- mean(data_df$y)

# BUG FIX: the original code referenced data_df$f1 before that column was
# created (it is only added in the Residuals section further below), so SSR
# and SSE were sums over an empty vector and both F and the p-value printed
# as NaN. Compute the fitted values here, before the sums of squares.
f1 <- b0 + b1 * data_df$x

# calculate the sum of squares terms
SSR <- sum( (f1 - y_mean)^2 )
SSE <- sum( (data_df$y - f1)^2 )

# calculate the F statistic
f_score_2 <- (SSR/df2) / (SSE/dfe)

# convert to p value
p_value_2 <- pf(f_score_2, df2, dfe, lower.tail = FALSE)
print(paste('F =', f_score_2))
## now matches summary(fit2): F ≈ 54.04 on 1 and 118 DF
print(paste('p-value =', p_value_2))
## p-value ≈ 2.8e-11
# compare to the in-built function in R
summary(fit2)
## 
## Call:
## lm(formula = y ~ x, data = data_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3.13400 -0.76369  0.02025  0.78792  2.23803 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.68412    0.40368  19.035  < 2e-16 ***
## x           -0.07378    0.01004  -7.351 2.81e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.105 on 118 degrees of freedom
## Multiple R-squared:  0.3141, Adjusted R-squared:  0.3083 
## F-statistic: 54.04 on 1 and 118 DF,  p-value: 2.806e-11

This returns a p-value of 2.8e-11, so the null hypothesis that there is no relationship between the fitting function and the data is rejected.

Thus we can be confident that there is a statistically significant relationship.

Goodness of fit

print(paste('Best fit from lm: b1=', b1))
## [1] "Best fit from lm: b1= -0.0737832003247987"
b1_values <- seq(-0.5, 0.5, 0.01)
mse <- b1_values

for (i in seq(1, length(b1_values))){
  yfit <- b0 + b1_values[i] * data_df$x
  mse[i] <- sum((data_df$y - yfit)^2)
}
gg <- ggplot()
gg <- gg + geom_point(aes(x = b1_values, y = mse))
gg <- gg + labs(x = 'Value for b1 coefficient', y = 'MSE')
gg

Residuals

# fit to each value of x
data_df$f1 <- b0 + b1 * data_df$x

# calculate the residual
data_df$e1 <- data_df$y - data_df$f1

gg <- ggplot()
gg <- gg + geom_point(aes(x = data_df$x, y = data_df$e1))
gg <- gg + labs(x = 'x', y = 'residual')
gg

plot(fit2)

gg <- ggplot()
gg <- gg + geom_histogram(aes(x = data_df$e1), bins = 10, colour = "black") 
gg <- gg + labs(title = "Residuals Plot", x = "Residuals") 
gg

A linear regression model was fitted to predict the dependent variable, number of hours watched in that day, using the customer's age as a single predictor. Prior to fitting the regression, a scatter plot assessing the bivariate relationship between hours watched and age was inspected. The scatter plot demonstrated evidence of a negative linear relationship. The overall regression model was statistically significant, F(1,118)=54.04, p<.001, and explained 31.41% of the variability in hours watched, R2=0.3141. The estimated regression equation was y = 7.684 - 0.074x. The negative slope for age was statistically significant, b=-0.074, 95% CI [-0.094, -0.054]. Final inspection of the residuals supported normality and homoscedasticity.

Correlation

bivariate<-as.matrix(dplyr::select(streaming_b, age, hours_watched)) 
rcorr(bivariate, type = "pearson")
##                 age hours_watched
## age            1.00         -0.56
## hours_watched -0.56          1.00
## 
## n= 120 
## 
## 
## P
##               age hours_watched
## age                0           
## hours_watched  0

This confidence interval (shown below) does not capture 0; therefore, H0 was rejected. There was a statistically significant negative correlation between age and hours_watched.

r=cor(streaming_b$age, streaming_b$hours_watched)
CIr(r = r, n = 120, level = .95)
## [1] -0.6721693 -0.4237816

A Pearson's correlation was calculated to measure the strength of the linear relationship between the customer's age and hours watched. The negative correlation was statistically significant, r=-0.56, p<.001, 95% CI [-0.672, -0.424].

GROUP A - relationship between social_metric & hours_watched

# x-axis / y-axis data
x <- streaming_a$social_metric
y <- streaming_a$hours_watched

# number of points
n <- length(x)

# create a dataframe for x and y
data_df <- data.frame(x = x, y = y)

# quick plot
qplot(data_df$x, data_df$y)

# create a model called "fit1"
fit3 = lm(y ~ x, data = data_df) 

# contains coeff [1]=intercept, [2]=slope; find with coef(fit)[1]
summary(fit3)
## 
## Call:
## lm(formula = y ~ x, data = data_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7544 -0.8255  0.0273  0.8349  3.7632 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.87785    0.08295  46.747  < 2e-16 ***
## x            0.09414    0.01449   6.495 1.39e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.294 on 878 degrees of freedom
## Multiple R-squared:  0.04585,    Adjusted R-squared:  0.04476 
## F-statistic: 42.19 on 1 and 878 DF,  p-value: 1.386e-10
c0 <- coef(fit3)[1]
c1 <- coef(fit3)[2]

Plot the fit over the data

# setup x variable with range of interest
xfit1 <- seq(min(data_df$x), max(data_df$x), 1)

# calculate the fitting function yfit based on the coefficients from the model
yfit1 <- c0 + c1 * xfit1

gg <- ggplot()
gg <- gg + geom_point(aes(x = data_df$x, y = data_df$y))
gg <- gg + geom_line(aes(x = xfit1, y = yfit1), colour = 'red')
gg <- gg + labs(x = 'x', y = 'y')
gg

fit3 %>% summary() %>% coef()
##               Estimate Std. Error   t value      Pr(>|t|)
## (Intercept) 3.87785183 0.08295454 46.746710 1.831933e-240
## x           0.09413641 0.01449303  6.495289  1.386127e-10
fit3 %>% confint()
##                  2.5 %    97.5 %
## (Intercept) 3.71503948 4.0406642
## x           0.06569138 0.1225814

F-test on the best fit

# Manual F-test for fit3 (group A, social_metric): y = c0 + c1 * x,
# 2 free parameters.
n_paras <- 2

# degrees of freedom for error term
dfe <- n - n_paras

# total number of degrees of freedom
df_total <- n - 1

# number of degrees of freedom for regression
df2 <- df_total - dfe

# calculate the mean of the observed data
y_mean <- mean(data_df$y)

# BUG FIX: the original code referenced data_df$f1 before that column was
# created (it is only added in the Residuals section further below), so SSR
# and SSE were sums over an empty vector and both F and the p-value printed
# as NaN. Compute the fitted values here, before the sums of squares.
f1 <- c0 + c1 * data_df$x

# calculate the sum of squares terms
SSR <- sum( (f1 - y_mean)^2 )
SSE <- sum( (data_df$y - f1)^2 )

# calculate the F statistic
f_score_2 <- (SSR/df2) / (SSE/dfe)

# convert to p value
p_value_2 <- pf(f_score_2, df2, dfe, lower.tail = FALSE)
print(paste('F =', f_score_2))
## now matches summary(fit3): F ≈ 42.19 on 1 and 878 DF
print(paste('p-value =', p_value_2))
## p-value ≈ 1.39e-10
# compare to the in-built function in R
summary(fit3)
## 
## Call:
## lm(formula = y ~ x, data = data_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7544 -0.8255  0.0273  0.8349  3.7632 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.87785    0.08295  46.747  < 2e-16 ***
## x            0.09414    0.01449   6.495 1.39e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.294 on 878 degrees of freedom
## Multiple R-squared:  0.04585,    Adjusted R-squared:  0.04476 
## F-statistic: 42.19 on 1 and 878 DF,  p-value: 1.386e-10

This returns a p-value of 1.39e-10, so the null hypothesis that there is no relationship between the fitting function and the data is rejected.

Thus we can be confident that there is a statistically significant relationship.

Goodness of fit

print(paste('Best fit from lm: c1=', c1))
## [1] "Best fit from lm: c1= 0.0941364119295324"
c1_values <- seq(-0.5, 0.5, 0.01)
mse <- c1_values

for (i in seq(1, length(c1_values))){
  yfit <- c0 + c1_values[i] * data_df$x
  mse[i] <- sum((data_df$y - yfit)^2)
}
gg <- ggplot()
gg <- gg + geom_point(aes(x = c1_values, y = mse))
gg <- gg + labs(x = 'Value for c1 coefficient', y = 'MSE')
gg

Residuals

# fit to each value of x
data_df$f1 <- c0 + c1 * data_df$x

# calculate the residual
data_df$e1 <- data_df$y - data_df$f1

gg <- ggplot()
gg <- gg + geom_point(aes(x = data_df$x, y = data_df$e1))
gg <- gg + labs(x = 'x', y = 'residual')
gg

plot(fit3)

gg <- ggplot()
gg <- gg + geom_histogram(aes(x = data_df$e1), bins = 10, colour = "black") 
gg <- gg + labs(title = "Residuals Plot", x = "Residuals") 
gg

A linear regression model was fitted to predict the dependent variable, number of hours watched in that day, using the customer's social_metric as a single predictor. Prior to fitting the regression, a scatter plot assessing the bivariate relationship between hours watched and social_metric was inspected. The scatter plot demonstrated evidence of a positive linear relationship. The overall regression model was statistically significant, F(1,878)=42.19, p<.001, and explained 4.59% of the variability in hours watched, R2=0.046. The estimated regression equation was y = 3.878 + 0.094x. The positive slope for social_metric was statistically significant, b=0.094, 95% CI [0.066, 0.123]. Final inspection of the residuals supported normality and homoscedasticity.

Correlation

bivariate<-as.matrix(dplyr::select(streaming_a, social_metric, hours_watched)) 
rcorr(bivariate, type = "pearson")
##               social_metric hours_watched
## social_metric          1.00          0.21
## hours_watched          0.21          1.00
## 
## n= 880 
## 
## 
## P
##               social_metric hours_watched
## social_metric                0           
## hours_watched  0

This confidence interval (shown below) does not capture 0; therefore, H0 was rejected. There was a statistically significant positive correlation between social_metric and hours_watched.

r=cor(streaming_a$social_metric, streaming_a$hours_watched)
CIr(r = r, n = 880, level = .95)
## [1] 0.1501594 0.2762984

A Pearson's correlation was calculated to measure the strength of the linear relationship between the customer's social_metric and hours watched. The positive correlation was statistically significant, r=0.21, p<.001, 95% CI [0.150, 0.276].

GROUP B - relationship between social_metric & hours_watched

# x-axis / y-axis data
x <- streaming_b$social_metric
y <- streaming_b$hours_watched

# number of points
n <- length(x)

# create a dataframe for x and y
data_df <- data.frame(x = x, y = y)

# quick plot
qplot(data_df$x, data_df$y)

# create a model called "fit1"
fit4 = lm(y ~ x, data = data_df) 

# contains coeff [1]=intercept, [2]=slope; find with coef(fit)[1]
summary(fit4)
## 
## Call:
## lm(formula = y ~ x, data = data_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8525 -0.8858  0.1861  0.7691  3.0019 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.94574    0.23817  16.567  < 2e-16 ***
## x            0.16558    0.04003   4.136 6.65e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.247 on 118 degrees of freedom
## Multiple R-squared:  0.1266, Adjusted R-squared:  0.1192 
## F-statistic: 17.11 on 1 and 118 DF,  p-value: 6.654e-05
d0 <- coef(fit4)[1]
d1 <- coef(fit4)[2]

Plot the fit over the data

# setup x variable with range of interest
xfit1 <- seq(min(data_df$x), max(data_df$x), 1)

# calculate the fitting function yfit based on the coefficients from the model
yfit1 <- d0 + d1 * xfit1

gg <- ggplot()
gg <- gg + geom_point(aes(x = data_df$x, y = data_df$y))
gg <- gg + geom_line(aes(x = xfit1, y = yfit1), colour = 'red')
gg <- gg + labs(x = 'x', y = 'y')
gg

fit4 %>% summary() %>% coef()
##              Estimate Std. Error   t value     Pr(>|t|)
## (Intercept) 3.9457431 0.23816724 16.567111 1.405089e-32
## x           0.1655755 0.04003423  4.135847 6.653896e-05
fit4 %>% confint()
##                 2.5 %    97.5 %
## (Intercept) 3.4741071 4.4173791
## x           0.0862968 0.2448542

F-test on the best fit

# Manual F-test for fit4 (group B, social_metric): y = d0 + d1 * x,
# 2 free parameters.
n_paras <- 2

# degrees of freedom for error term
dfe <- n - n_paras

# total number of degrees of freedom
df_total <- n - 1

# number of degrees of freedom for regression
df2 <- df_total - dfe

# calculate the mean of the observed data
y_mean <- mean(data_df$y)

# BUG FIX: the original code referenced data_df$f1 before that column was
# created (it is only added in the Residuals section further below), so SSR
# and SSE were sums over an empty vector and both F and the p-value printed
# as NaN. Compute the fitted values here, before the sums of squares.
f1 <- d0 + d1 * data_df$x

# calculate the sum of squares terms
SSR <- sum( (f1 - y_mean)^2 )
SSE <- sum( (data_df$y - f1)^2 )

# calculate the F statistic
f_score_2 <- (SSR/df2) / (SSE/dfe)

# convert to p value
p_value_2 <- pf(f_score_2, df2, dfe, lower.tail = FALSE)
print(paste('F =', f_score_2))
## now matches summary(fit4): F ≈ 17.11 on 1 and 118 DF
print(paste('p-value =', p_value_2))
## p-value ≈ 6.65e-05
# compare to the in-built function in R
summary(fit4)
## 
## Call:
## lm(formula = y ~ x, data = data_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8525 -0.8858  0.1861  0.7691  3.0019 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.94574    0.23817  16.567  < 2e-16 ***
## x            0.16558    0.04003   4.136 6.65e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.247 on 118 degrees of freedom
## Multiple R-squared:  0.1266, Adjusted R-squared:  0.1192 
## F-statistic: 17.11 on 1 and 118 DF,  p-value: 6.654e-05

This returns a p-value of 6.65e-05, so the null hypothesis that there is no relationship between the fitting function and the data is rejected.

Thus we can be confident that there is a statistically significant relationship.

Goodness of fit

print(paste('Best fit from lm: d1=', d1))
## [1] "Best fit from lm: d1= 0.165575482143317"
# sweep candidate slope values and record the fit error at each one
d1_values <- seq(-0.5, 0.5, 0.01)
mse <- numeric(length(d1_values))

for (i in seq_along(d1_values)) {
  # NOTE(review): the intercept used here is c0 while the Residuals section
  # below uses d0 -- confirm both hold the same fitted intercept
  yfit <- c0 + d1_values[i] * data_df$x
  # fixed: the original stored sum((y - yfit)^2), i.e. the SSE, under the
  # name and axis label "MSE"; taking the mean makes the label accurate and
  # leaves the minimising value of d1 unchanged
  mse[i] <- mean((data_df$y - yfit)^2)
}
# error curve: the minimum should sit at the lm-estimated slope
gg <- ggplot()
gg <- gg + geom_point(aes(x = d1_values, y = mse))
gg <- gg + labs(x = 'Value for d1 coefficient', y = 'MSE')
gg

Residuals

# Fitted value for every observation, then the residual (observed - fitted)
data_df$f1 <- d0 + d1 * data_df$x
data_df$e1 <- data_df$y - data_df$f1

# Residuals against the predictor: a patternless cloud supports linearity
gg <- ggplot() +
  geom_point(aes(x = data_df$x, y = data_df$e1)) +
  labs(x = 'x', y = 'residual')
gg

# Built-in lm diagnostics (residuals vs fitted, Q-Q, scale-location, leverage)
plot(fit4)

# Histogram of the residuals as an informal normality check
gg <- ggplot() +
  geom_histogram(aes(x = data_df$e1), bins = 10, colour = "black") +
  labs(title = "Residuals Plot", x = "Residuals")
gg

A linear regression model was fitted to predict the dependent variable, number of hours watched in that day, using the customer's social_metric as a single predictor. Prior to fitting the regression, a scatter plot assessing the bivariate relationship between hours watched and social_metric was inspected. The scatter plot demonstrated evidence of a positive linear relationship. The overall regression model was statistically significant, F(1,118)=17.11, p<.001, and explained 12.66% of the variability in hours watched, R2=0.1266. The estimated regression equation was y = 3.946 + 0.166x. The positive slope for social_metric was statistically significant, b=0.166, 95% CI [0.086, 0.245]. Final inspection of the residuals supported normality and homoscedasticity.

Correlation

# Pearson correlation matrix between social_metric and hours_watched for
# group B; rcorr (Hmisc package) also reports n and pairwise p-values
bivariate<-as.matrix(dplyr::select(streaming_b, social_metric, hours_watched)) 
rcorr(bivariate, type = "pearson")
##               social_metric hours_watched
## social_metric          1.00          0.36
## hours_watched          0.36          1.00
## 
## n= 120 
## 
## 
## P
##               social_metric hours_watched
## social_metric                0           
## hours_watched  0

This confidence interval (computed below) does not capture 0, the value under H0; therefore, H0 was rejected. There was a statistically significant positive correlation between social_metric and hours_watched.

# Pearson correlation for group B with its 95% CI (CIr, from the
# psychometric package); n = 120 matches group B's sample size
r=cor(streaming_b$social_metric, streaming_b$hours_watched)
CIr(r = r, n = 120, level = .95)
## [1] 0.1886059 0.5029810

A Pearson's correlation was calculated to measure the strength of the linear relationship between the customer's social_metric and hours watched. The positive correlation was statistically significant, r=0.36, p<.001, 95% CI [0.189, 0.503].

Multiple Regression

GROUP A: Age & Social Metric

# Group A: hours watched against age and social metric; both colour and
# point size encode hours watched
gg <- ggplot() +
  geom_point(aes(x = streaming_a$age, y = streaming_a$social_metric,
                 colour = streaming_a$hours_watched,
                 size = streaming_a$hours_watched)) +
  labs(x = 'Age', y = 'Social Metrics')
gg

# allocate colour by Hours(watched) bands
streaming_a %>% summarise(min = min(hours_watched), max = max(hours_watched))
## # A tibble: 1 x 2
##     min   max
##   <dbl> <dbl>
## 1   0.5   8.3
# band membership: 1 for <= 3 hours, 2 for (3, 6], 3 for > 6
streaming_a$streaming_colour <- case_when(
  streaming_a$hours_watched > 6 ~ 3,
  streaming_a$hours_watched > 3 ~ 2,
  streaming_a$hours_watched <= 3 ~ 1
)

# Same scatter with the discrete hour bands driving the colour scale
gg <- ggplot(data = streaming_a) +
  geom_point(aes(x = age, y = social_metric,
                 colour = factor(streaming_colour), size = hours_watched)) +
  labs(x = 'Age', y = 'Social Metric',
       colour = 'Hours band', size = 'Hours watched')
gg

# Group A: regress hours watched on age and social metric jointly
reg_model_1a <- lm(hours_watched ~ age + social_metric, data = streaming_a) 

summary(reg_model_1a)
## 
## Call:
## lm(formula = hours_watched ~ age + social_metric, data = streaming_a)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6244 -0.6361 -0.0271  0.6988  2.8773 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.535941   0.137147  47.657  < 2e-16 ***
## age           -0.072279   0.003262 -22.157  < 2e-16 ***
## social_metric  0.084869   0.011619   7.305 6.25e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.037 on 877 degrees of freedom
## Multiple R-squared:  0.3883, Adjusted R-squared:  0.3869 
## F-statistic: 278.3 on 2 and 877 DF,  p-value: < 2.2e-16
# create new model including gender
reg_model_2a <- lm(hours_watched ~ age + social_metric + gender,
                 data = streaming_a) 

# perform anova test: nested-model F test of whether adding gender
# significantly reduces the residual sum of squares over reg_model_1a
anova(reg_model_1a, reg_model_2a)
## Analysis of Variance Table
## 
## Model 1: hours_watched ~ age + social_metric
## Model 2: hours_watched ~ age + social_metric + gender
##   Res.Df   RSS Df Sum of Sq      F Pr(>F)
## 1    877 942.9                           
## 2    876 942.9  1 0.0080154 0.0074 0.9313

This does indicate that it is not worth adding gender

What is the accuracy?

We can use the coefficient of determination (R^2) to gauge the accuracy of a regression model.

# Re-display the group A model; Multiple R-squared (0.3883) is the share of
# variance in hours_watched explained by age + social_metric
summary(reg_model_1a)
## 
## Call:
## lm(formula = hours_watched ~ age + social_metric, data = streaming_a)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6244 -0.6361 -0.0271  0.6988  2.8773 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.535941   0.137147  47.657  < 2e-16 ***
## age           -0.072279   0.003262 -22.157  < 2e-16 ***
## social_metric  0.084869   0.011619   7.305 6.25e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.037 on 877 degrees of freedom
## Multiple R-squared:  0.3883, Adjusted R-squared:  0.3869 
## F-statistic: 278.3 on 2 and 877 DF,  p-value: < 2.2e-16

In this case R2=0.3883 which is not particularly accurate, but better than a pure guess.

GROUP B: Age & Social Metric

# Group B: hours watched against age and social metric; both colour and
# point size encode hours watched
gg <- ggplot() +
  geom_point(aes(x = streaming_b$age, y = streaming_b$social_metric,
                 colour = streaming_b$hours_watched,
                 size = streaming_b$hours_watched)) +
  labs(x = 'Age', y = 'Social Metrics')
gg

# allocate colour by Hours(watched) bands
streaming_b %>% summarise(min = min(hours_watched), max = max(hours_watched))
## # A tibble: 1 x 2
##     min   max
##   <dbl> <dbl>
## 1  1.52  7.93
# band membership: 1 for <= 3 hours, 2 for (3, 6], 3 for > 6
streaming_b$streaming_colour <- case_when(
  streaming_b$hours_watched > 6 ~ 3,
  streaming_b$hours_watched > 3 ~ 2,
  streaming_b$hours_watched <= 3 ~ 1
)

# Same scatter with the discrete hour bands driving the colour scale
gg <- ggplot(data = streaming_b) +
  geom_point(aes(x = age, y = social_metric,
                 colour = factor(streaming_colour), size = hours_watched)) +
  labs(x = 'Age', y = 'Social Metric',
       colour = 'Hours band', size = 'Hours watched')
gg

# Group B: regress hours watched on age and social metric jointly
reg_model_1b <- lm(hours_watched ~ age + social_metric, data = streaming_b) 

summary(reg_model_1b)
## 
## Call:
## lm(formula = hours_watched ~ age + social_metric, data = streaming_b)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.65282 -0.61812  0.06309  0.68267  1.80700 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.840745   0.391174  17.488  < 2e-16 ***
## age           -0.075783   0.008972  -8.446 9.47e-14 ***
## social_metric  0.176314   0.031714   5.560 1.73e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9874 on 117 degrees of freedom
## Multiple R-squared:  0.4574, Adjusted R-squared:  0.4482 
## F-statistic: 49.32 on 2 and 117 DF,  p-value: 2.92e-16
# create new model including gender
reg_model_2b <- lm(hours_watched ~ age + social_metric + gender,
                 data = streaming_b) 

# perform anova test: nested-model F test; gender adds almost nothing here
# (p = 0.89 in the output below)
anova(reg_model_1b, reg_model_2b)
## Analysis of Variance Table
## 
## Model 1: hours_watched ~ age + social_metric
## Model 2: hours_watched ~ age + social_metric + gender
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    117 114.07                           
## 2    116 114.05  1  0.019333 0.0197 0.8887

This does indicate that it is not worth adding gender

What is the accuracy?

We can use the coefficient of determination (R^2) to gauge the accuracy of a regression model.

# Re-display the group B model; Multiple R-squared (0.4574) is the share of
# variance in hours_watched explained by age + social_metric
summary(reg_model_1b)
## 
## Call:
## lm(formula = hours_watched ~ age + social_metric, data = streaming_b)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.65282 -0.61812  0.06309  0.68267  1.80700 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.840745   0.391174  17.488  < 2e-16 ***
## age           -0.075783   0.008972  -8.446 9.47e-14 ***
## social_metric  0.176314   0.031714   5.560 1.73e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9874 on 117 degrees of freedom
## Multiple R-squared:  0.4574, Adjusted R-squared:  0.4482 
## F-statistic: 49.32 on 2 and 117 DF,  p-value: 2.92e-16

In this case R2=0.4574 which is not particularly accurate, but better than a pure guess.

GROUP A : Gender & Age

# Group A: hours watched against age and gender (colour/size encode hours)
gg <- ggplot()
gg <- gg + geom_point(aes(x = streaming_a$age, y = streaming_a$gender,
                          colour = streaming_a$hours_watched, size = streaming_a$hours_watched))
gg <- gg + labs(x = 'Age',
                y = 'Gender')
gg

# allocate colour by Hours(watched) bands
streaming_a %>% summarise(min = min(hours_watched), max = max(hours_watched))
## # A tibble: 1 x 2
##     min   max
##   <dbl> <dbl>
## 1   0.5   8.3
# recompute the same hour bands as the earlier Group A section:
# 1 for <= 3, 2 for (3, 6], 3 for > 6
streaming_a$streaming_colour <- case_when(
  streaming_a$hours_watched <= 3 ~ 1,
  streaming_a$hours_watched <= 6 ~ 2,
  streaming_a$hours_watched  > 6 ~ 3
)


# NOTE(review): gender_no is created here but never referenced in the visible
# analysis (the lm calls use the gender factor directly) -- confirm whether
# it is used elsewhere before removing
streaming_a$gender_no <- case_when(
  streaming_a$gender == "M" ~ 0,
  streaming_a$gender == "F" ~ 1
)

gg <- ggplot(data = streaming_a)
gg <- gg + geom_point(aes(x = age, y = gender,
                          colour = factor(streaming_colour), size = hours_watched))
gg <- gg + labs(x = 'Age',
                y = 'Gender',
                colour = 'Hours band', size = 'Hours watched')
gg

# Group A: regress hours watched on age and gender only
reg_model_3a <- lm(hours_watched ~ age + gender, data = streaming_a) 

summary(reg_model_3a)
## 
## Call:
## lm(formula = hours_watched ~ age + gender, data = streaming_a)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5012 -0.7161 -0.0025  0.7546  2.9941 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.966822   0.133080  52.351   <2e-16 ***
## age         -0.073123   0.003358 -21.777   <2e-16 ***
## genderM      0.023433   0.072304   0.324    0.746    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.068 on 877 degrees of freedom
## Multiple R-squared:  0.3511, Adjusted R-squared:  0.3497 
## F-statistic: 237.3 on 2 and 877 DF,  p-value: < 2.2e-16
# create new model including social metric
reg_model_4a <- lm(hours_watched ~ gender + age + social_metric,
                 data = streaming_a) 


# perform anova test: does social_metric improve on age + gender?
anova(reg_model_3a, reg_model_4a)
## Analysis of Variance Table
## 
## Model 1: hours_watched ~ age + gender
## Model 2: hours_watched ~ gender + age + social_metric
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    877 1000.1                                  
## 2    876  942.9  1    57.254 53.192 6.769e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

This does indicate that it is worth adding social_metric

What is the accuracy?

We can use the coefficient of determination (R^2) to gauge the accuracy of a regression model.

# Full group A model (gender + age + social_metric); R-squared is 0.3883
summary(reg_model_4a)
## 
## Call:
## lm(formula = hours_watched ~ gender + age + social_metric, data = streaming_a)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6271 -0.6337 -0.0293  0.7011  2.8744 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.532680   0.142334  45.897  < 2e-16 ***
## genderM        0.006065   0.070284   0.086    0.931    
## age           -0.072276   0.003264 -22.142  < 2e-16 ***
## social_metric  0.084835   0.011632   7.293 6.77e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.037 on 876 degrees of freedom
## Multiple R-squared:  0.3883, Adjusted R-squared:  0.3862 
## F-statistic: 185.3 on 3 and 876 DF,  p-value: < 2.2e-16

GROUP B : Gender & Age

# Group B: hours watched against age and gender (colour/size encode hours)
gg <- ggplot()
gg <- gg + geom_point(aes(x = streaming_b$age, y = streaming_b$gender,
                          colour = streaming_b$hours_watched, size = streaming_b$hours_watched))
gg <- gg + labs(x = 'Age',
                y = 'Gender')
gg

# allocate colour by Hours(watched) bands
streaming_b %>% summarise(min = min(hours_watched), max = max(hours_watched))
## # A tibble: 1 x 2
##     min   max
##   <dbl> <dbl>
## 1  1.52  7.93
# hour bands: 1 for <= 3, 2 for (3, 6], 3 for > 6
streaming_b$streaming_colour <- case_when(
  streaming_b$hours_watched <= 3 ~ 1,
  streaming_b$hours_watched <= 6 ~ 2,
  streaming_b$hours_watched  > 6 ~ 3
)


gg <- ggplot(data = streaming_b)
gg <- gg + geom_point(aes(x = age, y = gender,
                          colour = factor(streaming_colour), size = hours_watched))
# fixed: the y axis shows gender, but the label previously said 'Social Metric'
gg <- gg + labs(x = 'Age',
                y = 'Gender',
                colour = 'Hours band', size = 'Hours watched')
gg

# Group B: regress hours watched on age and gender only
reg_model_5b <- lm(hours_watched ~ age + gender, data = streaming_b) 

summary(reg_model_5b)
## 
## Call:
## lm(formula = hours_watched ~ age + gender, data = streaming_b)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3.13293 -0.76250  0.02137  0.78897  2.23929 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.687047   0.433327  17.740  < 2e-16 ***
## age         -0.073770   0.010103  -7.301 3.74e-11 ***
## genderM     -0.004544   0.237292  -0.019    0.985    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.11 on 117 degrees of freedom
## Multiple R-squared:  0.3141, Adjusted R-squared:  0.3024 
## F-statistic: 26.79 on 2 and 117 DF,  p-value: 2.636e-10
# create new model including social_metric
# (the original comment said "temperature" -- a copy-paste leftover)
reg_model_6b <- lm(hours_watched ~ age + gender + social_metric,
                 data = streaming_b) 

# perform anova test: does social_metric improve on age + gender?
anova(reg_model_5b, reg_model_6b)
## Analysis of Variance Table
## 
## Model 1: hours_watched ~ age + gender
## Model 2: hours_watched ~ age + gender + social_metric
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    117 144.20                                  
## 2    116 114.05  1    30.153 30.668 1.935e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

This does indicate that it is worth adding social_metric parameter

What is the accuracy?

We can use the coefficient of determination (R^2) to gauge the accuracy of a regression model.

# Full group B model (age + gender + social_metric); R-squared is 0.4575
summary(reg_model_6b)
## 
## Call:
## lm(formula = hours_watched ~ age + gender + social_metric, data = streaming_b)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.64527 -0.62527  0.06564  0.68931  1.81567 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.859460   0.414876  16.534  < 2e-16 ***
## age           -0.075697   0.009031  -8.382 1.40e-13 ***
## genderM       -0.029726   0.211986  -0.140    0.889    
## social_metric  0.176409   0.031855   5.538 1.94e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9916 on 116 degrees of freedom
## Multiple R-squared:  0.4575, Adjusted R-squared:  0.4435 
## F-statistic: 32.61 on 3 and 116 DF,  p-value: 2.319e-15

In this case R2=0.4575 which is not particularly accurate, but better than a pure guess.

Prediction based on regression model

In this case we can use our regression model to predict the number of hours watched with new recommendation engine algorithm by particular age + gender + social_metric

# Score four hypothetical subscribers with the group B (new engine) model;
# column names must match the model's predictors: age, gender, social_metric
new_data_df <- data.frame(age = c(35,40, 27, 60),
                          gender = c("F","M","F","M"),
                          social_metric  = c(2,5,8,9))

# predicted hours watched per day for each hypothetical subscriber
predict(reg_model_6b, new_data_df)
##        1        2        3        4 
## 4.562871 4.683886 6.226906 3.875576

AB TESTING

Two-sample hypothesis test. Our null hypothesis H0 is that the two groups A and B have the same efficacy, i.e. that they produce an equivalent number of hours watched in that day.

H0:μ1=μ2

The statistical significance is then measured by the p-value, i.e. the probability of observing a discrepancy between our samples at least as strong as the one that we actually observed.

The alternative hypothesis Ha is that there is a difference in the number of hours watched between the two groups.

HA: μ1 ≠ μ2
A two-tailed test is preferable in our case, since we have no a priori reason to expect the discrepancy between the results of A and B to favor either group. This means the alternative hypothesis Ha is that A and B have different efficacy.

## Minimum sample size
sd <- sd(streaming_df$hours_watched)
n <- length(streaming_df)
z_alpha <- 1.96

moe <- z_alpha * sd / sqrt(n)

min_n <- ((z_alpha * sd)/moe)^2

print(paste('Min sample size', min_n))
## [1] "Min sample size 8"
# Hours watched by gender, split by A/B group (dodged boxplots)
e <- ggplot(streaming_df, aes(x = gender, y = hours_watched))
e2 <- e +
  geom_boxplot(aes(fill = group), position = position_dodge(0.9)) +
  scale_fill_manual(values = c("#999999", "#E69F00"))
e2

# Hours watched against age; NOTE(review): age is continuous here, so
# geom_boxplot collapses it into one box per fill group -- the age_group
# version below is the more informative view
e <- ggplot(streaming_df, aes(x = age, y = hours_watched))
e2 <- e +
  geom_boxplot(aes(fill = group), position = position_dodge(0.9)) +
  scale_fill_manual(values = c("#999999", "#E69F00"))
e2 + facet_wrap(~group)

e2

# Bin age into decades; an explicit catch-all prevents NA labels for ages 60+
# (the original had no default branch and a trailing comma)
streaming_df$age_group <- case_when(
  streaming_df$age <= 19 ~ "10~19",
  streaming_df$age <= 29 ~ "20~29",
  streaming_df$age <= 39 ~ "30~39",
  streaming_df$age <= 49 ~ "40~49",
  streaming_df$age <= 59 ~ "50~59",
  TRUE ~ "60+"
)


# Hours watched by age band and A/B group
e <- ggplot(streaming_df, aes(x = age_group, y = hours_watched))
e2 <- e + geom_boxplot(
  aes(fill = group),
  position = position_dodge(0.9) 
  ) +
  scale_fill_manual(values = c("#999999", "#E69F00"))
e2 + facet_wrap(~group)

e2

# Band the social metric; wrap in factor() with explicit levels so the x-axis
# is ordered numerically ("<= 10" would otherwise sort alphabetically before
# "<= 2" on the discrete axis)
streaming_df$social_metric_g <- factor(
  case_when(
    streaming_df$social_metric <= 2 ~ "<= 2",
    streaming_df$social_metric <= 4 ~ "<= 4",
    streaming_df$social_metric <= 6 ~ "<= 6",
    streaming_df$social_metric <= 8 ~ "<= 8",
    streaming_df$social_metric <= 10 ~ "<= 10"
  ),
  levels = c("<= 2", "<= 4", "<= 6", "<= 8", "<= 10")
)

# Hours watched by social-metric band and A/B group
e <- ggplot(streaming_df, aes(x = social_metric_g, y = hours_watched))
e2 <- e + geom_boxplot(
  aes(fill = group),
  position = position_dodge(0.9) 
  ) +
  scale_fill_manual(values = c("#999999", "#E69F00"))
e2 + facet_wrap(~group)

e2

# Binary outcome: 1 if the user watched more than 4.5 hours that day, else 0
streaming_df$hours_outcome <- case_when(
  streaming_df$hours_watched <= 4.5 ~ 0,
  streaming_df$hours_watched  > 4.5 ~ 1
)


# age cut-offs for the young/old split: a stricter cut (29) applies to the
# high-social-metric segment, a looser one (40) to the rest
age_cut_above <- 29
age_cut_below <- 40

# group above/below the line
streaming_df$above <- streaming_df$social_metric > 5

# create young/old groups with different splits depending on the above/below group
streaming_df$young <- case_when(
  streaming_df$above ~ streaming_df$age <= age_cut_above,
  TRUE ~ streaming_df$age <= age_cut_below
)

# one row per (above, young) combination with its user count; mutate(n = n())
# followed by distinct keeps rows in first-appearance order
summary_df <- streaming_df %>% dplyr:: select(above, young) %>%
  group_by(above, young) %>% mutate(n = n()) %>% distinct()

Check group balances

# count the numbers in each demographic category based on the A/B group
# count the numbers in each demographic category based on the A/B group
check_a_df <- streaming_df %>% 
  filter(group == 'A') %>% 
  dplyr:: select(gender, above, young) %>% 
  group_by(gender, above, young) %>% 
  mutate(n_a = n()) %>% 
  distinct()

# same tally for group B
check_b_df <- streaming_df %>% 
  filter(group == 'B') %>% 
  dplyr:: select(gender, above, young) %>% 
  group_by(gender, above, young) %>% 
  mutate(n_b = n()) %>% 
  distinct()

# total numbers in each group
n_total_a <- sum(streaming_df$group == 'A')
n_total_b <- sum(streaming_df$group == 'B')

# proportions in each demographic
check_a_df$p_a <- check_a_df$n_a / n_total_a
check_b_df$p_b <- check_b_df$n_b / n_total_b

# join on demo categories
# (inner_join with no 'by' matches on all shared columns: gender, above, young)
check_df <- inner_join(check_a_df, check_b_df)

# calculate the difference in proportions
check_df$diff <- check_df$p_a - check_df$p_b

# if there is no bias aside from sampling noise then the difference should be small and normally distributed
qqnorm(y = check_df$diff)

Examine the effect size

print('Outcome breakdown:')
## [1] "Outcome breakdown:"
# average hours watched per user in group A
cond_A <- streaming_df$group == 'A'
print(paste('A:', mean(streaming_df$hours_watched[cond_A])))
## [1] "A: 4.336125"
# fixed: group B's total was divided by sum(cond_A) (group A's size), which
# deflated B's average to ~0.66; dividing by B's own count -- equivalently,
# mean() over B's rows -- gives the comparable per-user figure
cond_B <- streaming_df$group == 'B'
print(paste('B:', mean(streaming_df$hours_watched[cond_B])))

Statistical significance tests

Break the demographics and statistics by each group and check for their significances. Looking at each group outcomes side by side for each demographic defined by the categories: gender, above, and young

# prepare data for group A
g_a <- streaming_df %>% 
  filter(group == 'A') %>% 
  ungroup() %>% 
  dplyr:: select(gender, above, young, hours_outcome) %>% 
  group_by(gender, above, young) %>%
  mutate(n_a = n(), n_a_o = sum(hours_outcome)) %>% 
  dplyr:: select(gender, above, young, n_a, n_a_o) %>%
  distinct()

g_a$p_a <- g_a$n_a_o / g_a$n_a

g_b <- streaming_df %>% 
  filter(group == 'B') %>% 
  ungroup() %>% 
  dplyr:: select(gender, above, young, hours_outcome) %>% 
  group_by(gender, above, young) %>%
  mutate(n_b = n(), n_b_o = sum(hours_outcome)) %>% 
  dplyr:: select(gender, above, young, n_b, n_b_o) %>%
  distinct()

g_b$p_b <- g_b$n_b_o / g_b$n_b

# effect comparison: join on all common column names
effect_comp_df <- inner_join(g_a, g_b)

effect_comp_df$effect <- effect_comp_df$p_b - effect_comp_df$p_a


pop_sd <- sd(g_a$p_a)

# required sample size
z_alpha <- 1.96
effect_comp_df$n_ss <- (z_alpha * pop_sd / effect_comp_df$effect)^2

effect_comp_df$significant <- effect_comp_df$n_a > effect_comp_df$n_ss

effect_comp_df
## # A tibble: 8 x 12
## # Groups:   gender, above, young [8]
##   gender above young   n_a n_a_o   p_a   n_b n_b_o   p_b effect  n_ss
##   <chr>  <lgl> <lgl> <int> <dbl> <dbl> <int> <dbl> <dbl>  <dbl> <dbl>
## 1 F      FALSE TRUE    138    81 0.587    11     9 0.818 0.231   5.46
## 2 F      TRUE  FALSE   115    50 0.435    11     8 0.727 0.292   3.41
## 3 M      TRUE  FALSE   142    54 0.380    36    25 0.694 0.314   2.96
## 4 M      FALSE TRUE    167    87 0.521    20    13 0.65  0.129  17.5 
## 5 M      FALSE FALSE   101    14 0.139    26     6 0.231 0.0922 34.3 
## 6 F      FALSE FALSE    95    11 0.116     4     1 0.25  0.134  16.2 
## 7 M      TRUE  TRUE     70    55 0.786     9     9 1     0.214   6.35
## 8 F      TRUE  TRUE     52    46 0.885     3     3 1     0.115  21.9 
## # … with 1 more variable: significant <lgl>
# Build human-readable demographic labels, then draw the effect heat map
plot_df <- effect_comp_df %>%
  mutate(
    demo_y = ifelse(young, 'Young', 'Old'),
    demo_g = ifelse(gender == 'M', 'Male', 'Female'),
    demo = paste(demo_y, demo_g)
  )

# Tile colour = effect size; rows split by social metric above/below the line
gg <- ggplot(plot_df, aes(x = demo, y = above, fill = effect)) +
  geom_tile(color = "transparent", size = 0.1) +
  labs(x = "Demographic", y = "Social metric over the line", fill = "Effect") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
gg

From this A/B test we would conclude that the change to the recommendation engine did indeed increase the number of hours watched, especially for users whose social metric is between 5 and 10 in the older demographic. Better recommendations improve user engagement and, importantly, increase the average hours watched per user per day.

This is an actionable insight that would help the business to make decision whether new recommendation engine algorithm is worth rolling out to all their subscribers and in the long term regarding better targeting of subscribers.

Discussion

References